{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom Interactions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook we will disable the automatic interaction detection built into the interpret API and instead detect the interactions ourselves and then incorporate them into the EBM. We will also detect and use 3-way interactions, which are typically not needed, but can sometimes be useful.\n",
    "\n",
    "This notebook can be found in our [**_examples folder_**](https://github.com/interpretml/interpret/tree/develop/docs/interpret/python/examples) on GitHub."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install interpret if not already installed\n",
    "try:\n",
    "    import interpret\n",
    "except ModuleNotFoundError:\n",
    "    !pip install --quiet interpret pandas scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from interpret.glassbox import ExplainableBoostingClassifier\n",
    "\n",
    "from interpret import set_visualize_provider\n",
    "from interpret.provider import InlineProvider\n",
    "set_visualize_provider(InlineProvider())\n",
    "\n",
    "df = pd.read_csv(\n",
    "    \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
    "    header=None)\n",
    "df.columns = [\n",
    "    \"Age\", \"WorkClass\", \"fnlwgt\", \"Education\", \"EducationNum\",\n",
    "    \"MaritalStatus\", \"Occupation\", \"Relationship\", \"Race\", \"Gender\",\n",
    "    \"CapitalGain\", \"CapitalLoss\", \"HoursPerWeek\", \"NativeCountry\", \"Income\"\n",
    "]\n",
    "X = df.iloc[:, :-1]\n",
    "y = df.iloc[:, -1]\n",
    "\n",
    "seed = 42\n",
    "np.random.seed(seed)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Build a mains model</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ebm1 = ExplainableBoostingClassifier(random_state=seed, interactions=0)\n",
    "ebm1.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Determine pairs</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from interpret.utils import measure_interactions\n",
    "from itertools import combinations\n",
    "\n",
    "n_features = X_train.shape[1]\n",
    "\n",
    "pairs = measure_interactions(X_train, y_train, interactions=combinations(range(n_features), 2), init_score=ebm1)\n",
    "pairs = [interaction for interaction, strength in pairs[:10]]  # select the top 10 pairs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Build a pure pair model</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ebm2 = ExplainableBoostingClassifier(random_state=seed, exclude=\"mains\", interactions=pairs)\n",
    "ebm2.fit(X_train, y_train, init_score=ebm1)\n",
    "\n",
    "# modify ebm2 slightly to not have any bins without type definitions\n",
    "ebm2.bins_ = [l1 if len(l2) == 0 else l2 for l1, l2 in zip(ebm1.bins_, ebm2.bins_)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Merge the mains and pure pairs into a single model</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from interpret.glassbox import merge_ebms\n",
    "\n",
    "ebm_pairs = merge_ebms([ebm1, ebm2])\n",
    "\n",
    "# There is no overlap between these EBMs, so merge_ebms will consider\n",
    "# the non-overlapping terms as having zeros for scores in the other model. \n",
    "# Undo this by multiplying the scores by 2.0. Also reduce the bin_weights_ \n",
    "# since we're merging the same underlying features.\n",
    "for i in range(len(ebm_pairs.term_features_)):\n",
    "    ebm_pairs.scale(i, 2.0)\n",
    "    ebm_pairs.bin_weights_[i] *= 0.5\n",
    "\n",
    "# add intercepts since we're not trying to average the models\n",
    "ebm_pairs.intercept_ = ebm1.intercept_ + ebm2.intercept_\n",
    "\n",
    "ebm_pairs.bagged_intercept_ = None\n",
    "ebm_pairs.bagged_scores_ = None\n",
    "ebm_pairs.standard_deviations_ = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Determine triples</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "triples = measure_interactions(X_train, y_train, interactions=combinations(range(n_features), 3), init_score=ebm_pairs)\n",
    "triples = [interaction for interaction, strength in triples[:10]]  # select the top 10 triples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Build a pure triple EBM</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ebm3 = ExplainableBoostingClassifier(random_state=seed, exclude=\"mains\", interactions=triples)\n",
    "ebm3.fit(X_train, y_train, init_score=ebm_pairs)\n",
    "\n",
    "# modify ebm3 slightly to not have any bins without type definitions\n",
    "ebm3.bins_ = [l1 if len(l3) == 0 else l3 for l1, l3 in zip(ebm1.bins_, ebm3.bins_)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Merge the mains, pairs, and triples into a single model</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ebm_triples = merge_ebms([ebm1, ebm2, ebm3])\n",
    "\n",
    "# There is no overlap between these EBMs, so merge_ebms will consider\n",
    "# the non-overlappig terms as having zeros for scores in the other model. \n",
    "# Undo this by multiplying the scores by 3.0. Also reduce the bin_weights_ \n",
    "# since we're merging the same underlying features.\n",
    "for i in range(len(ebm_triples.term_features_)):\n",
    "    ebm_triples.scale(i, 3.0)\n",
    "    ebm_triples.bin_weights_[i] *= 1.0/3.0\n",
    "\n",
    "# add intercepts since we're not trying to average the models\n",
    "ebm_triples.intercept_ = ebm1.intercept_ + ebm2.intercept_ + ebm3.intercept_\n",
    "\n",
    "ebm_triples.bagged_intercept_ = None\n",
    "ebm_triples.bagged_scores_ = None\n",
    "ebm_triples.standard_deviations_ = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>Evaluate the EBMs</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import log_loss\n",
    "\n",
    "loss1 = log_loss(y_test, ebm1.predict_proba(X_test))\n",
    "print(loss1)\n",
    "\n",
    "loss2 = log_loss(y_test, ebm_pairs.predict_proba(X_test))\n",
    "print(loss2)\n",
    "\n",
    "# compare our custom pair EBM with an EBM built to auto-discover the pairs\n",
    "ebm_default = ExplainableBoostingClassifier(random_state=seed, interactions=10)\n",
    "ebm_default.fit(X_train, y_train)\n",
    "loss2_default = log_loss(y_test, ebm_default.predict_proba(X_test))\n",
    "print(loss2_default)\n",
    "\n",
    "loss3 = log_loss(y_test, ebm_triples.predict_proba(X_test))\n",
    "print(loss3)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
